---
title: "DADA2 - pooled"
author: "Fabian Roger"
date: "1/16/2019"
output: html_notebook
---
##libraries

```{r, message = FALSE}
library(tidyverse)
library(knitr)
library(dada2)
library(vegan)
library(DECIPHER)
library(phyloseq)
library(gridExtra)
library(ape)
```

This script follows the [DADA2 Pipeline Tutorial (1.8)](http://benjjneb.github.io/dada2/tutorial.html) 
The original article can be found [here](http://rdcu.be/ipGh)

```{r}
packageVersion("dada2")
```

## data
```{r}

data.frame(`Sequencing centre` = "",
           Project = "",  
           `data delivery` = "",
           Samples = "80",
           `Sequencing amount` = "",
           `Sequencing type`  = "300 bp paired-end read (Illumina MiSeq V3)",
           `Amplicon type` = "Prokaryota 16S (U341F-U805R)") 
  
```

```{r}
meta <- read.delim("good_all_sampleInfo_simplenames.txt")
summary(meta)
```

##define path

define path variable that points to extracted, trimmed, unmerged, sequencing reads

```{r, "path"}

#path to demultiplexed and primer trimmed reads
path <- c("Sequencing_Data/cutadapt/")

# extract parent path
ParentPath <- sub("(.+/).+/$", "\\1", path)

#path to plots
plotpath <- paste(ParentPath, "plots", sep = "")

# make directory for plots
dir.create( path =  plotpath)

#path to filtered reads
filterpath <- paste(ParentPath, "DADA_filtered", sep = "")

# make directory for filtered files
dir.create( path =  filterpath)

# list files
fns <- list.files(path)

data.frame(forward = fns[grepl("R1", fns)],
           reverse = fns[grepl("R2", fns)]) #%>% 
  #kable(caption = "Files in directory")
  
```

## read files
First we read in the file names for all the fastq files and do a little string manipulation to get lists of the forward and reverse fastq files in matched order:

```{r, "sort fwrd and rev", cache=TRUE}
fastqs <- fns[grepl(".fastq.gz$", fns)] # if non fastq files in directory
fastqs <- sort(fastqs) # sort to ensures forward/reverse reads are in same order
fnFs <- fastqs[grepl("_R1", fastqs)] # Just the forward read files
fnRs <- fastqs[grepl("_R2", fastqs)] # Just the reverse read files

# Get sample names from the first part of the forward read filenames
sample.names <- sapply(strsplit(fnFs, "_"), `[`, 1)

# Fully specify the path for the fnFs and fnRs
fnFs <- paste0(path, fnFs)
fnRs <- paste0(path, fnRs)

```

Visualize the quality profile of the reads

## quality profile

```{r, "plot quality of reads", cache=TRUE, warning=FALSE, message=FALSE}
# plot one foward and one reverse read in the rapport
plotQualityProfile(fnFs[[1]])+
  scale_y_continuous(limits = c(0,40))+
  scale_x_continuous(breaks = seq(0,300, 20))+
  geom_hline(yintercept = 30, colour = "grey", linetype = "dashed")+
  geom_hline(yintercept = 20, colour = "grey", linetype = "dashed")+
  labs(title = paste(sample.names[1], "foward", sep = " - " ))

plotQualityProfile(fnRs[[1]])+
  scale_y_continuous(limits = c(0,40))+
  scale_x_continuous(breaks = seq(0,300, 20))+
  geom_hline(yintercept = 30, colour = "grey", linetype = "dashed")+
  geom_hline(yintercept = 20, colour = "grey", linetype = "dashed")+
  labs(title = paste(sample.names[1], "reverse", sep = " - "))

plotlistF <- list()

for( i in  seq_along(fnFs)) {
  plot <-  plotQualityProfile(fnFs[[i]])+
  scale_y_continuous(limits = c(0,40))+
  scale_x_continuous(breaks = seq(0,300, 20))+
  geom_hline(yintercept = 30, colour = "grey", linetype = "dashed")+
  geom_hline(yintercept = 20, colour = "grey", linetype = "dashed")+
  labs(title = paste(sample.names[i], "foward", sep = " - " ))
  plotlistF[[i]] <- plot
}


plotlistR <- list()

for( i in  seq_along(fnRs)) {
  plot <-  plotQualityProfile(fnRs[[i]])+
  scale_y_continuous(limits = c(0,40))+
  scale_x_continuous(breaks = seq(0,300, 20))+
  geom_hline(yintercept = 30, colour = "grey", linetype = "dashed")+
  geom_hline(yintercept = 20, colour = "grey", linetype = "dashed")+
  labs(title = paste(sample.names[i], "reverse", sep = " - "))
  plotlistR[[i]] <- plot
}


# interleave the two lists:

plotlist <- c(plotlistF, plotlistR)[order(c(seq_along(plotlistF), seq_along(plotlistR)))]

ggsave(paste(plotpath, "quality_plots.pdf", sep = "/"), marrangeGrob(grobs = plotlist, nrow=2, ncol=1), height = 10, width = 8)


```

There are no adapters / random bases / primers left on the sequences. 

We choose our other trimming parameters by inspecting the quality profiles. 

+ maxN=0 (DADA2 requires no Ns)
+ truncQ=25 
+ maxEE=(2, 5)

The `maxEE` parameter sets the maximum number of “expected errors” allowed in a read. 

We use the `fastqPairedFilter` function to jointly filter the forward and reverse reads.


We sequenced the **V3/V4** region of the **16S** gene, with the primers **341F** and **805R**

```{r}

data.frame(`.` = c( "sequence","direction","length"),
           U341F = c( "CCTACGGGNGGCWGCAG", "foward", nchar("CCTACGGGNGGCWGCAG")),
           U805R = c( "GACTACHVGGGTATCTAATCC", "reverse", nchar("GACTACHVGGGTATCTAATCC")))# %>% 
 # kable(caption = "primer sequences")

```


We used the 341F (CCTACGGGNGGCWGCAG) and 805R (GACTACHVGGGTATCTAATCC), [Herlemann 2011](https://www.nature.com/articles/ismej201141) which amplified a DNA fragment of 464bp length flanking the V3 and V4 regions of the 16S rRNA gene. 

## filter and trim

As we cut the primers, the expected fragment is `r 464 - 17 -20` ~425bp long. The forward reads are 283 bp long and the reverse reads 279 this gives an overlap of 137 bp. To ensure an overlap of at least 50bp, we can remove up to 87 bp in total. 

The forward reads are of good quality and drop only under a quality score of under 20 at the very end. We keep the first 270 bp. The reverse reads are of less good quality and drop below a quality score of 20 after 200 bp. We cut after 200 basepairs. We thus remove a total of 13 + 79 = 92pb, ensuring an overlap of 45 bp. 


```{r, "filter", cache=TRUE}

# Make filenames for the filtered fastq files
filtFs <- paste(paste(filterpath, sample.names,sep="/"), "_F_filt.fastq.gz", sep = "")
filtRs <- paste(paste(filterpath, sample.names,sep="/"), "_R_filt.fastq.gz", sep = "")

# Filter
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
              truncLen=c(270,200),
              maxN=0, 
              maxEE=c(2,5),
              truncQ=2,
              rm.phix=TRUE,
              compress=TRUE,
              verbose=TRUE,
              multithread = TRUE)

out %>% as.data.frame %>% mutate(prct = round(reads.out / reads.in * 100)) %>% pull(prct) %>% quantile()

```

#learn errors

Learn the Error Rates

The DADA2 algorithm depends on a parametric error model (err) and every amplicon dataset has a different set of error rates. The learnErrors method learns the error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many optimization problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors).



```{r, "learnerrors", cache=TRUE}
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
```

Visualize estimated error rates:

```{r, cache=TRUE}
plotErrors(errF, nominalQ=TRUE)
ggsave(paste(plotpath, "errorPlot_F.pdf", sep = "/"))

plotErrors(errR, nominalQ=TRUE)
ggsave(paste(plotpath, "errorPlot_R.pdf", sep = "/"))
```
   
   
## dereplicate
```{r, "Dereplication", cache=TRUE}

derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)


# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names

```


# denoise
Perform joint sample inference and error rate estimation:

We (pseudo) *pool* all samples for sequence inference to detect rare sequences across samples

see ?dada for details

```{r, "sample.inference", cache=TRUE}
dadaFs <- dada(derepFs, err=errF, pool = "pseudo", multithread = TRUE)
dadaRs <- dada(derepRs, err=errR, pool = "pseudo", multithread = TRUE)
```

```{r}
dadaFs[1]
```

##merge 

Spurious sequence variants are further reduced by merging overlapping reads. 


```{r, "merging reads", cache=TRUE}

mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)

# Inspect the merger data.frame from the first sample
head(mergers[[1]])


```


##Construct sequence table:

```{r, "Raw seqeunce table"}
seqtab <- makeSequenceTable(mergers)
dim(seqtab)

write_rds(seqtab, paste(ParentPath, "Seq_table_raw_EB.RDS", sep = ""))
seqtab <- readRDS(paste(ParentPath, "Seq_table_raw_EB.RDS", sep = ""))

save.image(paste(ParentPath, "DADA2_210607_rawseq.RData", sep = ""))

```


```{r, "Seqeunce table"}
data.frame(table(nchar(colnames(seqtab)))) %>% 
  ggplot( aes(x = Var1, y = Freq))+
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = -90))+
  labs(x = "length in bp", title = "histogramm of merged sequence lenghts")

ggsave(paste(plotpath, "seq_len_unfiltered.pdf", sep = "/"), width = 8, height = 6)
 

```

## length filtering
*optional*

remove sequences longer than 431 and shorter that 399 bp

```{r, "filter seqeunces by length"}

seq_wrong_len <- colnames(seqtab)[(nchar(colnames(seqtab)) > 431 | 
                                     nchar(colnames(seqtab)) < 399)]

seqtab_rl <- seqtab[, ! colnames(seqtab) %in% seq_wrong_len]
dim(seqtab_rl)
```

this removes `r (length(seq_wrong_len) / ncol(seqtab))*100` % of the unique sequences, accounting for 
`r (sum(seqtab[,seq_wrong_len]) / sum(seqtab))*100` % of the reads

```{r}
data.frame(table(nchar(colnames(seqtab_rl)))) %>% 
  ggplot( aes(x = Var1, y = Freq))+
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = -90))+
  labs(x = "length in bp", title = "histogramm of merged sequence lenghts")

ggsave(paste(plotpath, "seq_len_filtered.pdf", sep = "/"), width = 8, height = 6)
```

percentage of excluded reads: `r (1-(sum(seqtab_rl)/sum(seqtab)))*100`

percentage of excluded ASVs: `r (1-(ncol(seqtab_rl)/ncol(seqtab)))*100`


##remove Chimeras

```{r, "Chimera removal", cache=TRUE}

seqtab.nochim <- removeBimeraDenovo(seqtab_rl, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)

```

percentage of Chimeric reads: `r (1-(sum(seqtab.nochim)/sum(seqtab_rl)))*100`

percentage of Chimeric ASVs: `r (1-(ncol(seqtab.nochim)/ncol(seqtab_rl)))*100`



## track reads

how much reads lost during cutadapt?


```{r}
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab_rl), rowSums(seqtab.nochim))
colnames(track) <- c("trimmed", "filtered", "denoised", "merged", "tabled", "lentgh_filtered", "nonchim")
rownames(track) <- sample.names


track %>% 
  as.data.frame %>% 
  tibble::rownames_to_column(var="sample") %>% 
  mutate_at(vars(-sample), funs(prct = (./trimmed)*100)) %>% 
  select(sample,contains("prct")) %>% 
  gather(step, reads, -sample) %>% 
  mutate(step = factor(step, levels = c("trimmed_prct", "filtered_prct", 
                                        "denoised_prct", "merged_prct", 
                                        "tabled_prct", "lentgh_filtered_prct", "nonchim_prct"))) %>% 
  ggplot(aes(x=step, y=reads, group = sample))+
  geom_line(position = position_jitter(width = 0.1), colour = "grey", alpha = 0.3)+
  geom_point(position = position_jitter(width = 0.1), colour = "black",
             alpha = 0.3, shape = 21)+
  geom_violin(fill = NA, aes(group = step))+
  stat_summary(geom="line", fun.y="median", aes(group = 1))+
  scale_y_continuous(breaks = seq(0,100,10), limits = c(0,102))+
  theme_bw()+
  theme(axis.text.x = element_text(angle = -30, hjust = 0))

ggsave(paste(plotpath, "sequence_filtering.pdf", sep = "/"))
```


## assign Taxonomy 

We assign the taxonomy with the Silva reference file for two reasons:

+ it seems to have more Eukaryota / mitochondria / chloroplast sequences so that it correctly identify those in our samples
+ it produces fewer NA alignments than the RDP ref (although considerably more than gg at Class level) 

check [here](http://benjjneb.github.io/dada2/training.html) for newer releases. It is probably a good idea to replace the file with the newest available release. 

We use minBoot = 80% for the assignment threshold as recommended for sequences longer than 250 bp 

```{r, cache=TRUE}

training_path <- "../17_Emma_algea_flies/silva_nr99_v138.1_wSpecies_train_set.fa"

taxa <- assignTaxonomy( seqtab.nochim, 
                        training_path, 
                        multithread=TRUE,
                        minBoot = 80)



taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print, 20)
```

## check for non-microbial sequences
As we work with environmental samples we might have spurious non-bacterial sequences remaining, including 

+ Eukaryota
+ Chloroplast
+ Rickettsiales (=Mitochondria)

We will check how many sequences these represent

Which samples contain (sample name & sum of the sequences shown)

+ Chloroplasts

```{r}
rowSums(seqtab.nochim[, rownames(which(taxa == "Chloroplast", arr.ind=T)), drop = F])[which(rowSums(seqtab.nochim[, rownames(which(taxa == "Chloroplast", arr.ind=T)), drop = F]) > 0)]
```

+ Mitochondria

```{r}

rowSums(seqtab.nochim[, rownames(which(taxa == "Mitochondria", arr.ind=T)), drop = F])[which(rowSums(seqtab.nochim[, rownames(which(taxa == "Mitochondria", arr.ind=T)), drop = F]) > 0)]

```

+ Eukaryota

```{r}
rowSums(seqtab.nochim[, rownames(which(taxa == "Eukaryota", arr.ind=T)), drop = F])[which(rowSums(seqtab.nochim[, rownames(which(taxa == "Eukaryota", arr.ind=T)), drop = F]) > 0)]
```

in total 

```{r}
(sum(seqtab.nochim[, rownames(which(taxa == "Eukaryota" | taxa == "Mitochondria" | taxa == "Chloroplast", arr.ind=T))]) / sum(seqtab.nochim)) * 100
```

 % reads belong to either to the above groups, corresponding to 

```{r}
length(rownames(which(taxa == "Eukaryota" | taxa == "Mitochondria" | taxa == "Chloroplast", arr.ind=T))) / ncol(seqtab.nochim) * 100
```

% of the sequences. 


The sequences are removed from both the taxa table and the OTU table

Moreover, `r length(which(is.na(taxa[,1])))` sequences are not assigned at Kingdom level and are also removed. 
 

```{r}
exclude_seq <- rownames(which(taxa == "Eukaryota" | taxa == "Mitochondria" | taxa == "Chloroplast", arr.ind=T))
exclude_seq <- c(exclude_seq, names( which( is.na( taxa[,1]))) )

taxa_clean <- taxa[which(! rownames(taxa) %in% exclude_seq), ]
seqtab.nochim_clean <- seqtab.nochim[, which(! colnames(seqtab.nochim) %in% exclude_seq) ]

dim(seqtab.nochim_clean)
```

## rename sequences
```{r, "export seqeunces as fasta file"}
seqs <- getSequences(seqtab.nochim_clean)

#name the sequences seq1 to n but keep track of them 
seqnames <- data.frame(name_short = 
                         paste("seq", formatC(c(1:length(seqs)), 
                                                width = nchar(length(seqs)),
                                                flag = "0"),
                               sep = "_"),
                       sequence = seqs)

names(seqs) <- seqnames$name_short

seqs <- DNAStringSet(seqs)
```


```{r}
#dataframe to store sequence names

Seq_to_name <- 
  as.character(seqs) %>% 
  bind_rows(.id = "seq") %>% 
  pivot_longer(cols = everything(),
               names_to = "seq", 
               values_to = "sequence")

Seq_to_name <- Seq_to_name[-1,]
```


```{r}
#rename taxaonomy file
taxa_clean <-
taxa_clean %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sequence") %>% 
  left_join(Seq_to_name) %>% 
  select(-sequence) %>% 
  relocate(seq)

#rename ASV table
ASV_table_clean <- seqtab.nochim_clean 
colnames(ASV_table_clean) <-  Seq_to_name[match(colnames(ASV_table_clean), Seq_to_name$sequence),]$seq
```




## align seqs

We make a global alignment of the sequences with the `DECIPHER` package

for that we make a multiple sequence alignment with `AlignSeqs`. 


```{r}
seqs <- getSequences(seqtab.nochim_clean)
names(seqs) <- Seq_to_name[match(seqs, Seq_to_name$sequence),]$seq # This propagates to the tip labels of the tree
seqs <- DNAStringSet(seqs)
seqs <- OrientNucleotides(seqs, verbose = F)

msa <- AlignSeqs(seqs, iterations = 10, refinements = 10, verbose = FALSE, processors = 15) # make multiple sequence alignment

#BrowseSeqs(msa, htmlFile=paste(plotpath, "msa_alignment.html", sep = "/"), openURL = F, highlight = 1)

```

after the alignment, we also inspect a distance - rank plot of the sequences. If there are clear outliers in the histogram, it is likely that we still have non-biological sequences in the dataset that need to be removed. 

```{r}
Dist <- DistanceMatrix(msa, verbose = FALSE, processors = 15)

sort(colSums(Dist), decreasing = T)[1:1000] %>% 
  as.data.frame() %>%
  rename(`.` = "dist") %>% 
  rownames_to_column(var = "seq") %>% 
  left_join(taxa_clean) %>% 
  mutate(rank = 1:n()) %>% 
  ggplot(aes(x = rank, y = dist, colour = Phylum))+
  geom_point(position = position_jitter(width = 10, height = 10), size = 0.5)+
  geom_abline(intercept = 5500, 
              slope = 0,
              colour = "darkred")+
  labs(y = "colSums(Dist)",
       x = "Sequences (first 1000, decreasing order of total distance)",
       title = "Sum of Distances for all Sequences")

ggsave(paste(plotpath, "Seq_distance_raw.jpg", sep = "/"), width = 8, height = 6)


distSeqs <- colSums(Dist)[which(colSums(Dist) > 5500)] 
```

We can see that `r length(distSeqs)` Sequences are much more different from all other sequences that the remaining sequences are on average. This makes it likely that these sequences do not represent true biological sequences (e.g. undetected chimeras). This is further supported by the vast majority of these sequences not be assigned at Phylum level. 

excluding these sequences removes `r (length(distSeqs) / ncol(seqtab.nochim_clean)) * 100` % of the ASVs and
`r (sum(seqtab.nochim_clean[,Seq_to_name[Seq_to_name$seq %in% names(distSeqs),]$sequence]) / sum(seqtab.nochim_clean)) * 100` % of the reads


```{r}

ASV_table_clean_distfilt <- ASV_table_clean[, ! colnames(ASV_table_clean) %in% names(distSeqs)]

taxa_clean_distfilt <- filter(taxa_clean, ! seq %in% names(distSeqs))
seqs_filt <- seqs[!names(seqs) %in% names(distSeqs)]


writeXStringSet(seqs_filt, file= paste(ParentPath, "ASVs_clean.fasta", sep =""))
write_rds(ASV_table_clean_distfilt, paste(ParentPath, "ASVs_table_clean.RDS", sep =""))
write_tsv(taxa_clean_distfilt, paste(ParentPath, "Tax_table_clean.txt", sep =""))

```

##cluster sequences

Vsearch clustering

test clustering threshold
```{r}

RES <- tibble(ID = numeric(),
       n_cluster = numeric())

for (i in seq(0.9, 1, 0.01)){
  
  system(paste(
    "vsearch --cluster_fast /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_clean.fasta --centroids /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_clean_clustered.fasta --id ", i))
  
  L <- system("grep -c '^>' /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_clean_clustered.fasta", intern = T) 
  
  RES_temp <- tibble(ID = i,
                     n_cluster = as.numeric(L))
  
  RES <- rbind(RES, RES_temp)
  
}

ggplot(RES, aes(x = ID, y = n_cluster))+
  geom_point()

ggsave(paste(plotpath, "clusterthreshold.pdf", sep = "/"), width = 6, height = 6)

```

We will cluster the ASVs at 99% similarity as we had quite lax quality filtering and there might be remaining sequencing errors. 

```{r}

system("vsearch --cluster_fast /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_clean.fasta --centroids /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_99_clustered.fasta --sizein --sizeout --id 0.99 --uc /media/data3/Fabian/Emma/17_Emma_algea_flies/Sequencing_Data/ASVs_99_clustered.txt")
```

The COI_99_cluster.txt file tells us which sequences have been clustered

column 09 gives the original sequence name
column 10 gives the centroid sequence to which it has been clustered
column 01 tells us if the sequence is itself a centroid sequence ("S") or clustered to a centroid ("H")
column 04 gives the % identity with the centroid (here between 100% and 99%)

```{r}

Cluster_ID <- read_tsv("Sequencing_Data/ASVs_99_clustered.txt", col_names = FALSE)

#filter only sequences that have been clustered
Cluster_ID <- 
Cluster_ID %>% filter(X1 != "C") %>% 
  dplyr::rename(seq = X9, Type = X1, match = X10, pident = X4) %>% 
  select(seq, Type, match, pident) %>% 
  filter(Type == "H" ) %>% 
  arrange(match)

#strip abundance info from seqnames
Cluster_ID <- 
Cluster_ID %>% 
  mutate(seq = gsub("(.?);size.+", "\\1", seq)) %>% 
  mutate(match = gsub("(.?);size.+", "\\1", match))

#split by cluster centroid
Cluster_list <- split(Cluster_ID, Cluster_ID$match)


#sum abundance of clustered sequences
ASV_table_list <- 
  lapply(Cluster_list, function(x) {
  S_seq <- unique(x$match)
  C_seqs <- x$seq
  ASV_table_clean_distfilt[,S_seq] <- rowSums(ASV_table_clean_distfilt[,c(S_seq,C_seqs)])
})

#create new ASV table with only the centroid sequences
ASV_table_C <- ASV_table_clean_distfilt
ASV_table_C[,names(ASV_table_list)] <- t(bind_rows(ASV_table_list))
ASV_table_C <- ASV_table_C[, which(! colnames(ASV_table_C) %in% Cluster_ID$seq)] 


```

## merge  taxonomy

make consensus taxonomy for each cluster (in case taxonomy diverges)

1) I get a list for the taxonomic assignments for all sequences of the same cluster
2) I extract the unique taxonomies
3) I set non-congruent ranks to NA and get a consensus taxonomy

this works well but there are 5 unsual cases where the taxa is the same for the species but differs at genus level and sometime higher, which produces taxonomies were higher taxonomic level is NA but species level is not

I choose to ignore this problem as we don't work much with the lower taxonomic ranks anyway. The solution is to check when the first NA in the taxonomic assignment occurs and set all lower ranks to NA but this not implement here.

```{r}

#1
taxa_list <- 
  lapply(Cluster_list, function(x) {
  S_seq <- unique(x$match)
  C_seqs <- x$seq
  tax_cluster <- filter(taxa_clean_distfilt, seq %in% c(S_seq,C_seqs))
})

#2
taxa_unique <- 
taxa_list %>% 
  lapply(function(x){select(x, -seq) %>% 
      summarise(across(everything(), function(x) {
        z <- unique(na.omit(x))
        if(length(z) == 0) {z <- NA}
        return(z)
        })) %>% 
      distinct()
    })

#3
taxa_consensus <-
taxa_unique %>% 
  lapply(., function(x) {
    summarise(x, across(everything(), function(y){
      z <- unique(y)
      if(length(z) > 1) {z <- NA}
      return(z)
    }))
  })

#consensus for cluster > 1
taxa_consensus <- bind_rows(taxa_consensus, .id = "seq")

# subset taxonomy to cluster centroids
taxa_clust <- filter(taxa_clean, seq %in% colnames(ASV_table_C))

# exclud taxonomy for clusters > 1
taxa_clust <- filter(taxa_clust, ! seq %in% taxa_consensus$seq)

# join consensus taxonomy for clusters > 1
taxa_clust <- 
  rbind(taxa_clust, taxa_consensus) %>% 
  arrange(seq)

```

##prevalance filtering


```{r}
ASV_table_C_filt <- ASV_table_C[, colSums(ASV_table_C) >= 5]
ASV_table_C_filt <- ASV_table_C_filt[, colSums(ASV_table_C_filt > 0) > 1]

dim(ASV_table_C_filt)
```

percentage of excluded reads: `r (1-(sum(ASV_table_C_filt)/sum(ASV_table_C)))*100`

percentage of Chimeric ASVs: `r (1-(ncol(ASV_table_C_filt)/ncol(ASV_table_C)))*100`

```{r}
#export ASV table
write_tsv(ASV_table_C_filt, paste(ParentPath, "ASV_table_99.txt", sep =""))
write_rds(ASV_table_C_filt, paste(ParentPath, "ASV_table_99.RDS", sep =""))
#ASV_table_C <- read_rds(paste(ParentPath, "ASV_table_99.RDS", sep =""))


#export centroid fasta file
seqs_C <- seqs_filt[colnames(ASV_table_C_filt)]
writeXStringSet(seqs_C, paste(ParentPath, "ASV_99.fasta", sep =""))

#export consensus taxonomy
taxa_clust_filt <- filter(taxa_clust, seq %in% colnames(ASV_table_C_filt))

write_tsv(taxa_clust, paste(ParentPath, "Tax_table_99.txt", sep =""))
```


## phylogeny

We make a global alignment of the clustered sequences with the `DECIPHER` package

for that we make a multiple sequence alignment with `AlignSeqs`. 


```{r}
msa_C <- AlignSeqs(seqs_C, iterations = 10, refinements = 10, verbose = FALSE, processors = 15) # make multiple sequence alignment

```

after the alignment, we also inspect a distance - rank plot of the sequences. If there are clear outliers in the histogram, it is likely that we still have non-biological sequences in the dataset that need to be removed. 

```{r}
Dist_C <- DistanceMatrix(msa_C, verbose = FALSE, processors = 15)

sort(colSums(Dist_C), decreasing = T)[1:1000] %>% 
  as.data.frame() %>%
  rename(`.` = "dist") %>% 
  rownames_to_column(var = "seq") %>% 
  left_join(taxa_clean) %>% 
  mutate(rank = 1:n()) %>% 
  ggplot(aes(x = rank, y = dist, colour = Phylum))+
  geom_point(position = position_jitter(width = 10, height = 10), size = 0.5)+
  geom_abline(intercept = 5500, 
              slope = 0,
              colour = "darkred")+
  labs(y = "colSums(Dist)",
       x = "Sequences (first 1000, decreasing order of total distance)",
       title = "Sum of Distances for all Sequences")


#distSeqs <- colSums(Dist)[which(colSums(Dist) > 5500)] 
```



We also follow [the advice](https://www.bioconductor.org/packages/devel/bioc/vignettes/DECIPHER/inst/doc/ArtOfAlignmentInR.pdf) to Stagger the alignment for more accurate Tree building

>To mitigate the problem of false homologies, StaggerAlignment will automatically generate a staggered version of an existing alignment. Staggered alignments separate potentially non-homologous regions into separate columns of the alignment. The result is an alignment that is less visually appealing, but likely more accurate in a phylogenetic sense. As such, this is an important post-processing step whenever the alignment will be used to construct a phylogenetic tree


```{r}
msa_stag <- StaggerAlignment(msa_C, verbose = FALSE)


#BrowseSeqs(msa_stag, htmlFile=paste(plotpath, "msa_alignment_staggered.html", sep = "/"), openURL = F, highlight = 1)

writeXStringSet(msa_stag, file= paste(ParentPath, "ASV_99_aligned.fasta", sep ="/"))


#msa_stag_f <- msa_stag[!names(msa_stag) %in% names(susp_seq)]

```

We will use [FastTree](http://meta.microbesonline.org/fasttree/) to estimate an approximately-maximum-likelihood phylogenetic tree. It is orders of magnitudes faster than the corresponding R implementation phangorn. Also, because the downstream estimation of phylogenetic diversity requires an ultrametric tree, we use [PATHd8](http://www2.math.su.se/PATHd8/) for the phylogenetic dating. 

note that PATH8 produces a big file with lots of additional information that is not very relevant in our case. Therefore we grep the tree from the produced output file and save it separately. 

For this command to work you need to have FastrTree installed and in your path.

```{r}
system(paste("FastTree -gtr -nt",
              paste(ParentPath, "ASV_99_aligned.fasta", sep = "/"),
              ">",
              paste(ParentPath, "FastTree.tre", sep = "/")))

TREE <- read.tree(paste(ParentPath, "FastTree.tre", sep = "/"))

Archaea <- 
  filter(taxa_clust, Kingdom == "Archaea") %>% 
    dplyr::slice(1) %>% 
    pull(seq)

TREE <- root(TREE, Archaea)
TREE <- multi2di(TREE)

write.tree(TREE, paste(ParentPath, "rootedTREE.tre", sep = "/"))

#system("/Applications/PATHd8 16S_Data_DADA/rootedTREE.tre 16S_Data_DADA/rootedTREE_um.tre")

#system("grep '^d8 tree' 16S_Data_DADA/rootedTREE_um.tre > 16S_Data_DADA/rootedTREE_um_oT.tre")

plot(TREE,show.tip.label = FALSE)
```


## check technical replicates

```{r}

row.names(meta) <- meta$SampleID

taxa_clust_m <- as.matrix(taxa_clust[,-1])
rownames(taxa_clust_m) <- taxa_clust$seq

ps <- phyloseq(otu_table(ASV_table_C, taxa_are_rows = FALSE),
               tax_table(taxa_clust_m),
               sample_data(meta))

ps_wrack <- prune_samples(as.character(filter(meta, type == "wrack")$SampleID), ps)

GP.ord <- ordinate(ps_wrack, "NMDS", "bray")

plot_ordination(ps_wrack, GP.ord, type="samples", color="population")+
  geom_text_repel(aes(label = SampleID), size = 3, alpha = 0.8)+
  scale_color_brewer(palette = "Set1")+
  theme_bw()

ggsave(paste(plotpath, "wrack_nmds.pdf", sep = "/"), width = 5, height = 4)

```

It looks like sample Skeie-3-2 was mislabeled as it clearly clusters with all the samples from Smyg. All other technical replicates cluster together. 


